# Load the cancer registry CSV, convert column names to snake_case, and
# derive the variables used by the plots and models further down.
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  # Split the single `geography` text column at the comma into two columns.
  separate(geography, c("county", "state"), sep = ",") %>%
  mutate(
    # Splitting on "," alone keeps any whitespace that surrounded the comma
    # attached to the pieces; trim it so factor levels (and therefore plot
    # legends and group labels) carry no stray leading/trailing spaces.
    county = as.factor(trimws(county)),
    state = as.factor(trimws(state)),
    # Average annual case count scaled by 2015 population, per 100,000.
    pct_case_count = avg_ann_count / pop_est2015 * 100000,
    # Sum of the two 18-24 "no high school" / "high school" percentages.
    pct_hs = pct_no_hs18_24 + pct_hs18_24,
    # Sum of the 18-24 and 25-and-over bachelor's degree percentages.
    pct_bach_deg = pct_bach_deg18_24 + pct_bach_deg25_over
  ) %>%
  # Median ages of 100 or more are erroneous entries; drop those rows.
  filter(median_age < 100) %>%
  # Response first, derived case rate second, remaining columns unchanged.
  # (A separate earlier reordering step is unnecessary: this select alone
  # yields the same column order.)
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
## .default = col_double(),
## avgDeathsPerYear = col_integer(),
## medIncome = col_integer(),
## popEst2015 = col_integer(),
## binnedInc = col_character(),
## Geography = col_character()
## )
## See spec(...) for full column specifications.
Percentage of annual diagnosed case count plot
# Scatter plot of the scaled annual case count (y) against the target death
# rate (x), one point per row of cancer_df, coloured by state.
plot_count_pct <- ggplot(
  data = cancer_df,
  mapping = aes(x = target_death_rate, y = pct_case_count, colour = state)
) +
  geom_point()
# Smoother intentionally left disabled: geom_smooth(se = F)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_count_pct)
Incidence rate plot
# Scatter plot of the target death rate (y) against the incidence rate (x),
# coloured by state.
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, colour = state)
) +
  geom_point()
# Smoother intentionally left disabled: geom_smooth(se = F)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_incidence)
# Influential points in the dataset: the states Florida and Virginia.
Income plot
# Scatter plot of the target death rate (y) against median income (x),
# coloured by state.
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, colour = state)
) +
  geom_point()
# Smoother intentionally left disabled: geom_smooth(se = F)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_income)
Age plots
# Scatter plot of the target death rate against median age, with a smoothed
# trend line drawn without its confidence band.
plot_age_1 <-
  cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Use the reserved word FALSE rather than `F`: `F` is an ordinary variable
  # that can be reassigned, which would silently switch the band back on.
  geom_smooth(se = FALSE)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Histogram of median age. This column contains erroneous entries with very
# large values, so only rows with a median age below 100 are binned.
ggplot(
  data = filter(cancer_df, median_age < 100),
  mapping = aes(x = median_age)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of the target death rate against male median age, with a
# smoothed trend line drawn without its confidence band.
plot_age_2 <-
  cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Use the reserved word FALSE rather than `F`: `F` is an ordinary variable
  # that can be reassigned, which would silently switch the band back on.
  geom_smooth(se = FALSE)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter plot of the target death rate against female median age, with a
# smoothed trend line drawn without its confidence band.
plot_age_3 <-
  cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Use the reserved word FALSE rather than `F`: `F` is an ordinary variable
  # that can be reassigned, which would silently switch the band back on.
  geom_smooth(se = FALSE)
# Render the static ggplot as an interactive plotly widget.
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of the numeric columns. The factor columns
# (county, state) and the character column binned_inc are removed first
# because cor() requires numeric input.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  # as_tibble() replaces the deprecated as.tibble(). The conversion drops
  # the matrix row names, so row i corresponds to the i-th column.
  as_tibble()
## # A tibble: 35 x 35
## target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
## <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.0551 -0.143 -0.0904
## 2 -0.0551 1 0.161 -0.0589
## 3 -0.143 0.161 1 0.940
## 4 -0.0904 -0.0589 0.940 1
## 5 0.448 0.0230 0.0742 0.0631
## 6 -0.428 0.0278 0.269 0.223
## 7 -0.119 -0.0518 0.927 0.978
## 8 0.429 -0.123 -0.135 -0.0667
## 9 -0.0225 -0.00419 0.0819 0.0633
## 10 -0.00429 0.124 -0.122 -0.145
## # ... with 25 more rows, and 31 more variables: incidence_rate <dbl>,
## # med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## # study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## # median_age_female <dbl>, avg_household_size <dbl>,
## # percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## # pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## # pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## # pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## # pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## # pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## # pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## # pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>,
## # pct_hs <dbl>, pct_bach_deg <dbl>
# Fit a linear model for target_death_rate on incidence_rate, med_income,
# pct_bach_deg25_over (including the med_income x pct_bach_deg25_over
# interaction), pct_unemployed16_over and pct_public_coverage_alone, then
# print the coefficient table and fit statistics.
summary(
  lm(
    formula = target_death_rate ~ incidence_rate +
      med_income * pct_bach_deg25_over +
      pct_unemployed16_over + pct_public_coverage_alone,
    data = cancer_df
  )
)
##
## Call:
## lm(formula = target_death_rate ~ incidence_rate + med_income *
## pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone,
## data = cancer_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.623 -11.673 -0.075 11.770 138.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.240e+02 6.571e+00 18.871 < 2e-16 ***
## incidence_rate 2.158e-01 6.785e-03 31.803 < 2e-16 ***
## med_income -7.202e-04 9.862e-05 -7.303 3.59e-13 ***
## pct_bach_deg25_over -2.901e+00 2.399e-01 -12.094 < 2e-16 ***
## pct_unemployed16_over 8.016e-01 1.430e-01 5.605 2.27e-08 ***
## pct_public_coverage_alone 3.322e-01 1.095e-01 3.035 0.00243 **
## med_income:pct_bach_deg25_over 2.636e-05 4.197e-06 6.281 3.85e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.18 on 3010 degrees of freedom
## Multiple R-squared: 0.4715, Adjusted R-squared: 0.4704
## F-statistic: 447.5 on 6 and 3010 DF, p-value: < 2.2e-16